# encoding=utf-8
# Crawl a runoob.com tutorial and merge all of its pages into a single HTML file.

import gzip
import urllib2
import urlparse
from StringIO import StringIO

from BeautifulSoup import BeautifulSoup as bs


def get(a):
    """Fetch the URL *a* and return the response body as a byte string.

    The request advertises gzip support; when the server answers with
    ``Content-Encoding: gzip`` the body is decompressed before it is
    returned. Errors raised by ``urllib2.urlopen`` propagate to the caller.
    """
    request = urllib2.Request(a)
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)
    try:
        content_encoding = response.info().get('Content-Encoding')
        body = response.read()
    finally:
        # Release the connection even when reading fails part-way.
        response.close()
    if content_encoding == 'gzip':
        body = gzip.GzipFile(fileobj=StringIO(body)).read()
    return body


def write(path, s):
    """Write the string *s* to the file at *path*, replacing its content."""
    # The with-block closes the handle even if the write raises.
    with open(path, 'w') as f:
        f.write(s)


# Tutorial to download: its pages are addressed under
# http://www.runoob.com/<lang>/ and the crawl starts at <lang>-tutorial.html.
lang = 'http'
base_url = 'http://www.runoob.com/' + lang + '/'
page_url = base_url + lang + '-tutorial.html'

seen = set()    # slugs of pages already merged; stops the crawl on a link cycle
toc = []        # one table-of-contents link per merged page
sections = []   # article HTML of every merged page, in crawl order

while True:
    soup = bs(get(page_url))
    # The slug is the URL without the base prefix and the trailing '.html';
    # it is reused as the in-document anchor id for this page.
    slug = page_url[len(base_url):-5]
    print(slug)
    if slug in seen:
        break
    seen.add(slug)

    article = soup.find('div', attrs={'class': 'article-body'})
    if article is None:
        # Without an article body there is nothing to merge; stop here so the
        # pages collected so far are still written out instead of crashing.
        print('no article body found in ' + page_url)
        break

    # Search the already-parsed tag rather than re-parsing its serialized HTML.
    heading = article.find('h1')
    title = heading.text if heading is not None else slug

    # Give each page's container a unique id so the TOC links can target it.
    sections.append(str(article).replace('id="content"', 'id="' + slug + '"'))
    toc.append('<a href="#' + slug + '">' + title + '</a><br />')

    next_link = soup.find(rel="next")
    if next_link is None:
        break
    # Resolve against the current page so relative hrefs are fetchable;
    # absolute hrefs pass through urljoin unchanged.
    page_url = urlparse.urljoin(page_url, str(next_link['href']))

page_head = '''
<!Doctype html> <html xmlns=http://www.w3.org/1999/xhtml>
<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/></head><body>
'''

# The with-block closes the output file even if one of the writes fails.
with open('e:/runoob/' + lang + '.html', 'w') as f:
    f.write(page_head)
    # Titles come back from BeautifulSoup as unicode, so the TOC is encoded
    # explicitly; the article sections are already byte strings.
    f.write(''.join(toc).encode('utf-8'))
    f.write(''.join(sections))
    f.write('</body></html>')
